Preparation¶

InĀ [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.append('..')
from helper import get_latest_table
import warnings
warnings.simplefilter(action='ignore')
InĀ [2]:
# Read the clock once so month and year always describe the same instant
# (two separate now() calls could straddle a month or year boundary).
_now = pd.Timestamp.now()
current_month = _now.month
current_year = _now.year

# Load the three tables analysed in this notebook through the project helper.
# NOTE(review): get_latest_table is imported from ../helper.py; presumably it
# returns the newest snapshot of the named table as a DataFrame — confirm there.
cpu_data = get_latest_table('cpu_specs')
gpu_data = get_latest_table('gpu_specs')

full_relation = get_latest_table('full_relation')
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful

Preview the data

InĀ [3]:
# Report the size of every loaded table as "<label>: <rows> rows, <cols> columns"
for table_label, table in (
    ("CPU Data", cpu_data),
    ("GPU Data", gpu_data),
    ("Full Relation Data", full_relation),
):
    row_count, column_count = table.shape
    print(f"{table_label}: {row_count} rows, {column_count} columns")
CPU Data: 2348 rows, 28 columns
GPU Data: 618 rows, 13 columns
Full Relation Data: 1959 rows, 70 columns

Data Analysis¶

CPU Dataframe¶

Preview the data¶

Dataframe head¶

InĀ [4]:
# Preview the top of the CPU table (5 rows is also head()'s default)
print(cpu_data.head(n=5))
                    name performance_clockspeed performance_turbospeed  \
0   intel core i3 1315ue                   1.20                   4.50   
1     intel core i3 n300                   None                   3.80   
2    intel core i3 1305u                   1.60                   4.50   
3      amd ryzen 3 7320u                   2.40                   4.10   
4  intel core i5 1038ng7                   2.00                   3.80   

   performance_cores  performance_threads efficient_clockspeed  \
0                2.0                  4.0                 None   
1                8.0                  8.0                 None   
2                1.0                  2.0                 1.20   
3                4.0                  8.0                 None   
4                4.0                  8.0                 None   

  efficient_turbospeed  efficient_cores  efficient_threads    tdp  ...  \
0                 3.30              4.0                4.0  15.00  ...   
1                 None              NaN                NaN   7.00  ...   
2                 3.30              4.0                4.0  15.00  ...   
3                 None              NaN                NaN  15.00  ...   
4                 None              NaN                NaN  28.00  ...   

   eff_l2_cache  integer_math floating_point_math find_prime_numbers  \
0          None       34537.0             20958.0               51.0   
1          None       29169.0             19343.0               22.0   
2   1 x 2048 kb       27950.0             20052.0               36.0   
3          None       29638.0             14121.0               20.0   
4          None       27545.0             15238.0               28.0   

  random_string_sorting data_encryption data_compression physics  \
0               10759.0          6321.0         103162.0   824.0   
1               12797.0          7034.0         100731.0   516.0   
2               10623.0          6021.0          95060.0   518.0   
3               13922.0          6266.0         131689.0   437.0   
4               11471.0          5714.0         109286.0   698.0   

  extended_instructions  single_thread  
0                5172.0           3269  
1                5174.0           2122  
2                5262.0           3276  
3                5905.0           2378  
4                6539.0           2152  

[5 rows x 28 columns]

Dataframe tail¶

InĀ [5]:
# Preview the bottom of the CPU table: the last 5 rows (also tail()'s default)
print(cpu_data.tail(n=5))
                                name performance_clockspeed  \
2343                     intel u300e                   1.10   
2344  arm huawei,kunpeng 920 24 core                   2.60   
2345             amd custom apu 0932                   2.40   
2346            intel core i7 10710u                   1.10   
2347            intel core i3 1125g4                   2.00   

     performance_turbospeed  performance_cores  performance_threads  \
2343                   4.30                1.0                  2.0   
2344                   None               24.0                 24.0   
2345                   3.50                4.0                  8.0   
2346                   4.70                6.0                 12.0   
2347                   3.70                4.0                  8.0   

     efficient_clockspeed efficient_turbospeed  efficient_cores  \
2343                 None                 3.20              4.0   
2344                 None                 None              NaN   
2345                 None                 None              NaN   
2346                 None                 None              NaN   
2347                 None                 None              NaN   

      efficient_threads    tdp  ...  eff_l2_cache  integer_math  \
2343                4.0  15.00  ...   1 x 2048 kb       30218.0   
2344                NaN   None  ...          None       91062.0   
2345                NaN  15.00  ...          None       28027.0   
2346                NaN  15.00  ...          None       35167.0   
2347                NaN  28.00  ...          None       29716.0   

     floating_point_math find_prime_numbers random_string_sorting  \
2343             21589.0               45.0               11513.0   
2344             30906.0               48.0               40681.0   
2345             17049.0               23.0               14366.0   
2346             21715.0               31.0               16853.0   
2347             18257.0               34.0               12839.0   

     data_encryption data_compression physics extended_instructions  \
2343          6421.0          98379.0   599.0                5279.0   
2344          2447.0          94224.0   822.0               10829.0   
2345          7582.0         117043.0   613.0                6566.0   
2346          3269.0         128017.0   642.0                8051.0   
2347          5666.0         107758.0   577.0                7990.0   

      single_thread  
2343           3546  
2344            733  
2345           2263  
2346           2336  
2347           2476  

[5 rows x 28 columns]

Check all the features¶

InĀ [6]:
# List every column (feature) name available in the CPU table
print(cpu_data.columns)
Index(['name', 'performance_clockspeed', 'performance_turbospeed',
       'performance_cores', 'performance_threads', 'efficient_clockspeed',
       'efficient_turbospeed', 'efficient_cores', 'efficient_threads', 'tdp',
       'multithread_rating', 'single_thread_rating', 'l1_instruction_cache',
       'l1_data_cache', 'l2_cache', 'l3_cache', 'eff_l1_instruction_cache',
       'eff_l1_data_cache', 'eff_l2_cache', 'integer_math',
       'floating_point_math', 'find_prime_numbers', 'random_string_sorting',
       'data_encryption', 'data_compression', 'physics',
       'extended_instructions', 'single_thread'],
      dtype='object')

Check the data types and non-null counts¶

InĀ [7]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# to stdout itself and returns None, so call it directly; wrapping it in
# print() emits a stray "None" line after the report.
cpu_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2348 entries, 0 to 2347
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2348 non-null   object 
 1   performance_clockspeed    2338 non-null   object 
 2   performance_turbospeed    926 non-null    object 
 3   performance_cores         2259 non-null   float64
 4   performance_threads       2259 non-null   float64
 5   efficient_clockspeed      129 non-null    object 
 6   efficient_turbospeed      115 non-null    object 
 7   efficient_cores           163 non-null    float64
 8   efficient_threads         163 non-null    float64
 9   tdp                       1441 non-null   object 
 10  multithread_rating        2348 non-null   int64  
 11  single_thread_rating      2348 non-null   int64  
 12  l1_instruction_cache      1409 non-null   object 
 13  l1_data_cache             1407 non-null   object 
 14  l2_cache                  1405 non-null   object 
 15  l3_cache                  868 non-null    object 
 16  eff_l1_instruction_cache  103 non-null    object 
 17  eff_l1_data_cache         103 non-null    object 
 18  eff_l2_cache              93 non-null     object 
 19  integer_math              2149 non-null   float64
 20  floating_point_math       2149 non-null   float64
 21  find_prime_numbers        2012 non-null   float64
 22  random_string_sorting     2149 non-null   float64
 23  data_encryption           1155 non-null   float64
 24  data_compression          2149 non-null   float64
 25  physics                   2149 non-null   float64
 26  extended_instructions     2149 non-null   float64
 27  single_thread             2348 non-null   int64  
dtypes: float64(12), int64(3), object(13)
memory usage: 513.8+ KB
None

Look at descriptive statistics¶

InĀ [8]:
# Summary statistics from describe() with default arguments. In the output
# below only the numeric (float64/int64) columns appear; the object-dtype
# columns such as the clockspeeds and tdp are not summarised here.
print(cpu_data.describe())
       performance_cores  performance_threads  efficient_cores  \
count        2259.000000          2259.000000       163.000000   
mean            4.544046             5.947764         6.791411   
std             2.801395             4.002537         2.879159   
min             1.000000             1.000000         2.000000   
25%             2.000000             4.000000         4.000000   
50%             4.000000             4.000000         8.000000   
75%             8.000000             8.000000         8.000000   
max            32.000000            32.000000        16.000000   

       efficient_threads  multithread_rating  single_thread_rating  \
count         163.000000         2348.000000           2348.000000   
mean            6.957055         5056.670358           1393.641823   
std             3.081906         7341.559596           1016.341297   
min             2.000000           93.000000             95.000000   
25%             4.000000          840.500000            568.000000   
50%             8.000000         2168.500000           1086.500000   
75%             8.000000         5709.250000           1951.250000   
max            16.000000        57389.000000           4786.000000   

        integer_math  floating_point_math  find_prime_numbers  \
count    2149.000000          2149.000000         2012.000000   
mean    21716.518381         11977.772918           24.540258   
std     25417.697425         18214.180379           48.549368   
min       122.000000           166.000000            1.000000   
25%      5139.000000          1985.000000            5.000000   
50%     13523.000000          4760.000000           10.000000   
75%     25358.000000         12910.000000           23.250000   
max    209791.000000        131787.000000          619.000000   

       random_string_sorting  data_encryption  data_compression      physics  \
count            2149.000000      1155.000000       2149.000000  2149.000000   
mean             9679.062355      6004.123810      73248.829688   367.891112   
std             10193.531835      6337.221465      90040.342566   519.354517   
min               294.000000      1025.000000       2023.000000    14.000000   
25%              2917.000000      1869.000000      18278.000000    93.000000   
50%              5869.000000      3258.000000      38372.000000   184.000000   
75%             12586.000000      7656.500000      90448.000000   415.000000   
max             81685.000000     43769.000000     719086.000000  6476.000000   

       extended_instructions  single_thread  
count            2149.000000    2348.000000  
mean             3776.795254    1393.641823  
std              6023.329546    1016.341297  
min                25.000000      95.000000  
25%               537.000000     568.000000  
50%              1354.000000    1086.500000  
75%              3514.000000    1951.250000  
max             52490.000000    4786.000000  

Feature Analysis¶

Overall Performance Ratings¶

Features:

  • multithread_rating, single_thread_rating
Distribution of ratings¶
InĀ [9]:
# Use seaborn's white-grid theme for the figure
sns.set_theme(style="whitegrid")

# One histogram panel per rating: (column, colour, panel title, x-axis label)
rating_panels = [
    ('single_thread_rating', 'blue', "Single Thread Rating Distribution", 'Single Thread Rating'),
    ('multithread_rating', 'green', "Multithread Rating Distribution", 'Multithread Rating'),
]

# Side-by-side panels, filled left to right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (column, colour, title, x_label) in zip(axes, rating_panels):
    # Histogram with a KDE curve overlaid
    sns.histplot(cpu_data[column], ax=ax, color=colour, kde=True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Tidy the spacing, then render
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [10]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for both ratings
single_thread_stats, multithread_stats = (
    cpu_data[rating_column].describe()
    for rating_column in ('single_thread_rating', 'multithread_rating')
)

# Each summary is printed under its own heading; the second heading starts
# with a newline so a blank line separates the two reports.
print("Single Thread Rating Statistics:", single_thread_stats, sep="\n")
print("\nMultithread Rating Statistics:", multithread_stats, sep="\n")
Single Thread Rating Statistics:
count    2348.000000
mean     1393.641823
std      1016.341297
min        95.000000
25%       568.000000
50%      1086.500000
75%      1951.250000
max      4786.000000
Name: single_thread_rating, dtype: float64

Multithread Rating Statistics:
count     2348.000000
mean      5056.670358
std       7341.559596
min         93.000000
25%        840.500000
50%       2168.500000
75%       5709.250000
max      57389.000000
Name: multithread_rating, dtype: float64
Single vs Multithreaded¶
InĀ [11]:
# Use seaborn's white-grid theme for the figure
sns.set_theme(style="whitegrid")

# Open a new figure; scatterplot draws on its current axes and returns them
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=cpu_data,
    x='single_thread_rating',
    y='multithread_rating',
    alpha=0.7,
)

# Title, axis labels and grid are set on the returned axes
scatter_ax.set_title("Single Thread Rating vs Multithread Rating", fontsize=16)
scatter_ax.set_xlabel("Single Thread Rating", fontsize=14)
scatter_ax.set_ylabel("Multithread Rating", fontsize=14)
scatter_ax.grid(True)

plt.show()

# Correlation between the two ratings (Series.corr default method)
single_rating = cpu_data['single_thread_rating']
correlation = single_rating.corr(cpu_data['multithread_rating'])
print(f"The correlation between single_thread_rating and multithread_rating is: {correlation:.2f}")
No description has been provided for this image
The correlation between single_thread_rating and multithread_rating is: 0.88

Clockspeed metrics¶

Features:

  • performance_clockspeed, performance_turbospeed
  • efficient_clockspeed, efficient_turbospeed
Distribution¶
InĀ [12]:
# Use seaborn's white-grid theme for the figure
sns.set_theme(style="whitegrid")

# The four clock columns are stored with object dtype, so coerce them to
# floats once up front (the same pd.to_numeric conversion the later clockspeed
# cells apply). Values that cannot be parsed become NaN and are skipped by the
# plots and by the limit calculations below.
clock_columns = [
    'performance_clockspeed',
    'performance_turbospeed',
    'efficient_clockspeed',
    'efficient_turbospeed',
]
clock_speeds = cpu_data[clock_columns].apply(pd.to_numeric, errors='coerce')

# One KDE panel per column, in axes.flat (row-major) order:
# (column, colour, panel title, x-axis label)
clock_panels = [
    ('performance_clockspeed', 'blue', "Performance Cores' Clockspeed Distribution", 'Clockspeed (GHz)'),
    ('performance_turbospeed', 'green', "Performance Cores' Turbospeed Distribution", 'Turbospeed (GHz)'),
    ('efficient_clockspeed', 'red', "Efficient Cores' Clockspeed Distribution", 'Clockspeed (GHz)'),
    ('efficient_turbospeed', 'purple', "Efficient Cores' Turbospeed Distribution", 'Turbospeed (GHz)'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, colour, title, x_label) in zip(axes.flat, clock_panels):
    sns.kdeplot(clock_speeds[column].dropna(), ax=ax, color=colour, fill=True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Density')

# Common x range: smallest and largest value across all four columns
# (min()/max() skip NaN, so missing values do not affect the range).
x_min = clock_speeds.min().min()
x_max = clock_speeds.max().max()

# Common y range: the tallest upper y-limit matplotlib chose for any panel.
# This must run after all panels are drawn.
y_max = max(ax.get_ylim()[1] for ax in axes.flat)

# Apply the shared limits so the four densities are directly comparable
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

plt.tight_layout()
plt.show()
No description has been provided for this image
Correlation with Performance¶
InĀ [13]:
# Pull out the series involved once so each correlation reads as a single call
perf_clock = cpu_data['performance_clockspeed']
eff_clock = cpu_data['efficient_clockspeed']
single_rating = cpu_data['single_thread_rating']
multi_rating = cpu_data['multithread_rating']

# Base clockspeed of each core type against each rating
correlation_performance_single = perf_clock.corr(single_rating)
correlation_performance_multi = perf_clock.corr(multi_rating)
correlation_efficient_single = eff_clock.corr(single_rating)
correlation_efficient_multi = eff_clock.corr(multi_rating)

# Report every pair to two decimals, in the order computed above
clock_rating_correlations = {
    "performance_clockspeed and single_thread_rating": correlation_performance_single,
    "performance_clockspeed and multithread_rating": correlation_performance_multi,
    "efficient_clockspeed and single_thread_rating": correlation_efficient_single,
    "efficient_clockspeed and multithread_rating": correlation_efficient_multi,
}
for pair_label, pair_correlation in clock_rating_correlations.items():
    print(f"Correlation between {pair_label}: {pair_correlation:.2f}")
Correlation between performance_clockspeed and single_thread_rating: 0.61
Correlation between performance_clockspeed and multithread_rating: 0.48
Correlation between efficient_clockspeed and single_thread_rating: 0.21
Correlation between efficient_clockspeed and multithread_rating: 0.14
InĀ [14]:
# Work on a copy so the numeric coercion below never touches cpu_data itself
cpu_data_clone = cpu_data.copy()

# Coerce both clockspeed columns and both ratings to numbers (unparseable -> NaN)
for numeric_column in (
    'performance_clockspeed',
    'efficient_clockspeed',
    'single_thread_rating',
    'multithread_rating',
):
    cpu_data_clone[numeric_column] = pd.to_numeric(cpu_data_clone[numeric_column], errors='coerce')

# Panels in axes.flat (row-major) order:
# (x column, y column, colour, title, x-axis label, y-axis label)
clock_vs_rating_panels = [
    ('performance_clockspeed', 'single_thread_rating', 'blue',
     'Performance Clockspeed vs Single Thread Rating',
     'Performance Clockspeed (GHz)', 'Single Thread Rating'),
    ('performance_clockspeed', 'multithread_rating', 'green',
     'Performance Clockspeed vs Multithread Rating',
     'Performance Clockspeed (GHz)', 'Multithread Rating'),
    ('efficient_clockspeed', 'single_thread_rating', 'red',
     'Efficient Clockspeed vs Single Thread Rating',
     'Efficient Clockspeed (GHz)', 'Single Thread Rating'),
    ('efficient_clockspeed', 'multithread_rating', 'purple',
     'Efficient Clockspeed vs Multithread Rating',
     'Efficient Clockspeed (GHz)', 'Multithread Rating'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Scatter with a fitted regression line in every panel; small markers (s=10)
for ax, (x_col, y_col, colour, title, x_label, y_label) in zip(axes.flat, clock_vs_rating_panels):
    sns.regplot(data=cpu_data_clone, x=x_col, y=y_col, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
No description has been provided for this image
Boost impact¶
InĀ [15]:
# Work on a copy so the derived columns below never touch cpu_data itself
cpu_data_clone = cpu_data.copy()

# Coerce the four speed columns to numbers; anything unparseable becomes NaN
for speed_column in (
    'performance_turbospeed',
    'performance_clockspeed',
    'efficient_turbospeed',
    'efficient_clockspeed',
):
    cpu_data_clone[speed_column] = pd.to_numeric(cpu_data_clone[speed_column], errors='coerce')

# Turbo boost margin = turbo speed minus base clock speed, per core type
cpu_data_clone['performance_turbo_boost'] = cpu_data_clone['performance_turbospeed'].sub(
    cpu_data_clone['performance_clockspeed']
)
cpu_data_clone['efficient_turbo_boost'] = cpu_data_clone['efficient_turbospeed'].sub(
    cpu_data_clone['efficient_clockspeed']
)

# Panels in axes.flat (row-major) order:
# (x column, y column, colour, title, x-axis label, y-axis label)
boost_panels = [
    ('performance_turbo_boost', 'single_thread_rating', 'blue',
     'Performance Turbo Boost vs Single Thread Rating',
     'Performance Turbo Boost (GHz)', 'Single Thread Rating'),
    ('performance_turbo_boost', 'multithread_rating', 'green',
     'Performance Turbo Boost vs Multithread Rating',
     'Performance Turbo Boost (GHz)', 'Multithread Rating'),
    ('efficient_turbo_boost', 'single_thread_rating', 'red',
     'Efficient Turbo Boost vs Single Thread Rating',
     'Efficient Turbo Boost (GHz)', 'Single Thread Rating'),
    ('efficient_turbo_boost', 'multithread_rating', 'purple',
     'Efficient Turbo Boost vs Multithread Rating',
     'Efficient Turbo Boost (GHz)', 'Multithread Rating'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Scatter with a fitted regression line of each rating against each boost margin
for ax, (x_col, y_col, colour, title, x_label, y_label) in zip(axes.flat, boost_panels):
    sns.regplot(data=cpu_data_clone, x=x_col, y=y_col, ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
No description has been provided for this image

Core & Thread Analysis¶

Features:

  • performance_cores, performance_threads
  • efficient_cores, efficient_threads
Distribution¶
InĀ [16]:
# Use seaborn's white-grid theme for the figure
sns.set_theme(style="whitegrid")

# One histogram panel per column, in axes.flat (row-major) order:
# (column, colour, panel title, x-axis label)
core_thread_panels = [
    ('performance_cores', 'blue', "Performance Cores Distribution", 'Number of Cores'),
    ('performance_threads', 'green', "Performance Threads Distribution", 'Number of Threads'),
    ('efficient_cores', 'red', "Efficient Cores Distribution", 'Number of Cores'),
    ('efficient_threads', 'purple', "Efficient Threads Distribution", 'Number of Threads'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, colour, title, x_label) in zip(axes.flat, core_thread_panels):
    # Missing counts are dropped before plotting; a KDE curve is overlaid
    sns.histplot(cpu_data[column].dropna(), ax=ax, color=colour, kde=True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Common x range: the smallest per-column minimum and largest per-column maximum
x_min = min(cpu_data[column].min() for column, *_ in core_thread_panels)
x_max = max(cpu_data[column].max() for column, *_ in core_thread_panels)

# Common y range: the tallest upper y-limit matplotlib chose for any panel
y_max = max(ax.get_ylim()[1] for ax in axes.flat)

# Apply the shared limits so the four histograms are directly comparable
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [17]:
# Work on a copy so the ratio columns are not added to cpu_data itself
cpu_data_clone = cpu_data.copy()

# Ratio = cores / threads for each core type. In the recorded output the only
# values are 1.0 (one thread per core) and 0.5 (two threads per core).
cpu_data_clone['performance_core_thread_ratio'] = cpu_data_clone['performance_cores'].div(
    cpu_data_clone['performance_threads']
)
cpu_data_clone['efficient_core_thread_ratio'] = cpu_data_clone['efficient_cores'].div(
    cpu_data_clone['efficient_threads']
)

# How often each distinct ratio occurs, ordered by ratio value
performance_ratio_counts = (
    cpu_data_clone['performance_core_thread_ratio'].value_counts().sort_index()
)
efficient_ratio_counts = (
    cpu_data_clone['efficient_core_thread_ratio'].value_counts().sort_index()
)

# Each table is printed under its own heading; the second heading starts with
# a newline so a blank line separates the two tables.
print("Performance Core/Thread Ratio Frequencies:", performance_ratio_counts, sep="\n")
print("\nEfficient Core/Thread Ratio Frequencies:", efficient_ratio_counts, sep="\n")
Performance Core/Thread Ratio Frequencies:
performance_core_thread_ratio
0.5     774
1.0    1485
Name: count, dtype: int64

Efficient Core/Thread Ratio Frequencies:
efficient_core_thread_ratio
0.5      4
1.0    159
Name: count, dtype: int64
Multi-threading impact¶
InĀ [18]:
# Every correlation in this cell is taken against the multithread rating
multi_rating = cpu_data['multithread_rating']

# Core and thread counts of each core type against the multithread rating
correlation_performance_cores = cpu_data['performance_cores'].corr(multi_rating)
correlation_performance_threads = cpu_data['performance_threads'].corr(multi_rating)
correlation_efficient_cores = cpu_data['efficient_cores'].corr(multi_rating)
correlation_efficient_threads = cpu_data['efficient_threads'].corr(multi_rating)

# Report each feature to two decimals, in the order computed above
core_thread_correlations = {
    "performance_cores": correlation_performance_cores,
    "performance_threads": correlation_performance_threads,
    "efficient_cores": correlation_efficient_cores,
    "efficient_threads": correlation_efficient_threads,
}
for feature_name, feature_correlation in core_thread_correlations.items():
    print(f"Correlation between {feature_name} and multithread_rating: {feature_correlation:.2f}")
Correlation between performance_cores and multithread_rating: 0.41
Correlation between performance_threads and multithread_rating: 0.74
Correlation between efficient_cores and multithread_rating: 0.47
Correlation between efficient_threads and multithread_rating: 0.49
InĀ [19]:
# Panels in axes.flat (row-major) order; each x column is plotted against
# multithread_rating: (x column, colour, title, x-axis label)
core_thread_regression_panels = [
    ('performance_cores', 'blue', 'Performance Cores vs Multithread Rating', 'Performance Cores'),
    ('performance_threads', 'green', 'Performance Threads vs Multithread Rating', 'Performance Threads'),
    ('efficient_cores', 'red', 'Efficient Cores vs Multithread Rating', 'Efficient Cores'),
    ('efficient_threads', 'purple', 'Efficient Threads vs Multithread Rating', 'Efficient Threads'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Scatter with a fitted regression line in every panel; small markers (s=10)
for ax, (x_col, colour, title, x_label) in zip(axes.flat, core_thread_regression_panels):
    sns.regplot(data=cpu_data, x=x_col, y='multithread_rating', ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Multithread Rating')

plt.tight_layout()
plt.show()
No description has been provided for this image

Power Consumption (TDP)¶

Features:

  • TDP
TDP vs Performance¶
InĀ [20]:
# Copy of cpu_data whose 'tdp' column is coerced to numbers (unparseable -> NaN);
# assign() returns a new frame, so cpu_data itself is left untouched.
cpu_data_clone = cpu_data.assign(tdp=pd.to_numeric(cpu_data['tdp'], errors='coerce'))

# TDP against each of the two ratings
tdp_values = cpu_data_clone['tdp']
correlation_tdp_single = tdp_values.corr(cpu_data_clone['single_thread_rating'])
correlation_tdp_multi = tdp_values.corr(cpu_data_clone['multithread_rating'])

# Report both correlations to two decimals
for rating_name, rating_correlation in (
    ("single_thread_rating", correlation_tdp_single),
    ("multithread_rating", correlation_tdp_multi),
):
    print(f"Correlation between TDP and {rating_name}: {rating_correlation:.2f}")
Correlation between TDP and single_thread_rating: 0.39
Correlation between TDP and multithread_rating: 0.43
InĀ [21]:
# Build the numeric-TDP frame inside this cell instead of relying on whatever
# `cpu_data_clone` currently holds: that name is rebound by several other cells
# (one of which filters and sorts it), so depending on it would make this
# figure change with the order in which cells were last run.
cpu_tdp_numeric = cpu_data.assign(tdp=pd.to_numeric(cpu_data['tdp'], errors='coerce'))

# Panels left to right: (y column, colour, title, y-axis label); x is always TDP
tdp_panels = [
    ('single_thread_rating', 'blue', 'TDP vs Single Thread Rating', 'Single Thread Rating'),
    ('multithread_rating', 'green', 'TDP vs Multithread Rating', 'Multithread Rating'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Scatter with a fitted regression line in each panel; small markers (s=10)
for ax, (y_col, colour, title, y_label) in zip(axes, tdp_panels):
    sns.regplot(data=cpu_tdp_numeric, x='tdp', y=y_col, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(title)
    ax.set_xlabel('TDP (W)')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
No description has been provided for this image
Efficiency Analysis¶
InĀ [22]:
# Work on a copy so the filtering and derived column never touch cpu_data itself
cpu_data_clone = cpu_data.copy()

# Coerce 'tdp' to numbers; anything unparseable becomes NaN
cpu_data_clone['tdp'] = pd.to_numeric(cpu_data_clone['tdp'], errors='coerce')

# Keep only rows with a known, strictly positive TDP *before* dividing.
# A TDP of 0 would otherwise produce an infinite efficiency, which survives
# both dropna() and a "> 0" filter and would top the ranking and break the
# histogram. NaN compares False here, so rows with missing TDP are dropped too.
cpu_data_clone = cpu_data_clone[cpu_data_clone['tdp'] > 0].copy()

# Performance efficiency = multithread rating per unit of TDP
cpu_data_clone['performance_efficiency'] = cpu_data_clone['multithread_rating'] / cpu_data_clone['tdp']

# Discard non-positive efficiencies, then rank from most to least efficient
cpu_data_clone = cpu_data_clone[cpu_data_clone['performance_efficiency'] > 0]
cpu_data_clone = cpu_data_clone.sort_values(by='performance_efficiency', ascending=False)

# Columns shown in the two previews below
efficiency_columns = ['name', 'multithread_rating', 'tdp', 'performance_efficiency']

# Most efficient CPUs
print("Top 5 rows:")
print(cpu_data_clone[efficiency_columns].head())

# Least efficient CPUs
print("\nBottom 5 rows:")
print(cpu_data_clone[efficiency_columns].tail())

# Distribution of the efficiency metric (30 bins, KDE overlaid)
plt.figure(figsize=(10, 6))
sns.histplot(cpu_data_clone['performance_efficiency'], kde=True, color="blue", bins=30)
plt.title("Performance Efficiency Distribution", fontsize=16)
plt.xlabel("Performance Efficiency (multithread_rating / tdp)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(True)
plt.show()
Top 5 rows:
                        name  multithread_rating   tdp  performance_efficiency
117  intel core ultra 7 164u               15187   9.0             1687.444444
470     amd ryzen z1 extreme               25182  15.0             1678.800000
82             apple a18 pro               13063   8.0             1632.875000
91       intel core i7 1260u               14001   9.0             1555.666667
52       intel core i7 1250u               11673   9.0             1297.000000

Bottom 5 rows:
                             name  multithread_rating   tdp  \
735  mobile amd athlon xp-m 1800+                 193  45.0   
980    mobile amd athlon 64 3400+                 333  81.5   
974    mobile amd athlon 64 3200+                 326  81.5   
671            intel celeron b710                 106  35.0   
681  mobile intel celeron 1.80ghz                 121  66.1   

     performance_efficiency  
735                4.288889  
980                4.085890  
974                4.000000  
671                3.028571  
681                1.830560  
No description has been provided for this image

GPU Dataframe¶

Preview the data¶

Dataframe head¶

InĀ [23]:
# Display the first few rows of the GPU table (printed as plain text)
print(gpu_data.head())
                      name  avg_g3d_mark bus_interface  max_memory_size  \
0          radeon rx 6600m         13814   pcie 4.0 x8           8192.0   
1      radeont rx 6850m xt         13848  pcie 4.0 x16          12288.0   
2  rtx 1000 ada generation         14043          None              NaN   
3           rtx a3000 12gb         14088          None              NaN   
4         geforce rtx 4050         14433  pcie 4.0 x16           6144.0   

   core_clock max_direct open_gl  max_tdp  test_directx_9  test_directx_10  \
0      2068.0       12_2     4.6    100.0           180.0             89.0   
1      2321.0       12_2     4.6    165.0           144.0            106.0   
2         NaN       None    None      NaN           179.0             74.0   
3         NaN       None    None      NaN           169.0             88.0   
4      1605.0       12_2     4.6    115.0           186.0             81.0   

   test_directx_11  test_directx_12  test_gpu_compute  
0            135.0             52.0            5752.0  
1            166.0             59.0            5210.0  
2            115.0             65.0            5471.0  
3            115.0             65.0            5593.0  
4            131.0             61.0            5943.0  

Dataframe tail¶

InĀ [24]:
# Display the last few rows of the GPU table (printed as plain text)
print(gpu_data.tail())
                        name  avg_g3d_mark bus_interface  max_memory_size  \
613          radeon rx 7900m         22752          None              NaN   
614  rtx 4000 ada generation         22962          None              NaN   
615  rtx 5000 ada generation         24006          None              NaN   
616         geforce rtx 4080         25076  pcie 4.0 x16          12288.0   
617         geforce rtx 4090         27754  pcie 4.0 x16          16384.0   

     core_clock max_direct open_gl  max_tdp  test_directx_9  test_directx_10  \
613         NaN       None    None      NaN           267.0            127.0   
614         NaN       None    None      NaN           271.0            140.0   
615         NaN       None    None      NaN           272.0            153.0   
616      1860.0       12_2     4.6    150.0           286.0            161.0   
617      1455.0       12_2     4.6    150.0           315.0            181.0   

     test_directx_11  test_directx_12  test_gpu_compute  
613            256.0             93.0            9297.0  
614            224.0            100.0            9232.0  
615            239.0            102.0            9553.0  
616            248.0             96.0           11422.0  
617            270.0            107.0           12650.0  

Check all the features¶

InĀ [25]:
# List the name of every column (feature) in the GPU table
print(gpu_data.columns)
Index(['name', 'avg_g3d_mark', 'bus_interface', 'max_memory_size',
       'core_clock', 'max_direct', 'open_gl', 'max_tdp', 'test_directx_9',
       'test_directx_10', 'test_directx_11', 'test_directx_12',
       'test_gpu_compute'],
      dtype='object')

Check the data types and non-null counts¶

InĀ [26]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# straight to stdout and returns None, so it must not be wrapped in print():
# doing so appends a stray "None" line to the output.
gpu_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              618 non-null    object 
 1   avg_g3d_mark      618 non-null    int64  
 2   bus_interface     349 non-null    object 
 3   max_memory_size   342 non-null    float64
 4   core_clock        309 non-null    float64
 5   max_direct        353 non-null    object 
 6   open_gl           346 non-null    object 
 7   max_tdp           245 non-null    float64
 8   test_directx_9    340 non-null    float64
 9   test_directx_10   340 non-null    float64
 10  test_directx_11   340 non-null    float64
 11  test_directx_12   340 non-null    float64
 12  test_gpu_compute  340 non-null    float64
dtypes: float64(8), int64(1), object(4)
memory usage: 62.9+ KB
None

Look at descriptive statistics¶

InĀ [27]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric GPU columns
print(gpu_data.describe())
       avg_g3d_mark  max_memory_size   core_clock     max_tdp  test_directx_9  \
count    618.000000       342.000000   309.000000  245.000000      340.000000   
mean    2784.377023      2852.590643   756.132686   58.142857       64.752941   
std     4605.472224      3298.820120   366.364012   38.361524       67.330367   
min        2.000000         2.000000   143.000000    7.000000        1.000000   
25%      358.000000       512.000000   500.000000   25.000000       11.000000   
50%      671.500000      2048.000000   660.000000   50.000000       36.000000   
75%     2697.000000      4096.000000   954.000000   80.000000      107.250000   
max    27754.000000     16384.000000  2321.000000  165.000000      315.000000   

       test_directx_10  test_directx_11  test_directx_12  test_gpu_compute  
count       340.000000       340.000000       340.000000        340.000000  
mean         26.597059        38.252941        19.311765       1892.626471  
std          37.149137        51.488414        24.423437       2288.641490  
min           0.000000         0.000000         0.000000          0.000000  
25%           2.000000         4.000000         0.000000        239.500000  
50%           7.000000        15.000000         7.500000        806.000000  
75%          35.000000        54.000000        31.000000       2865.000000  
max         181.000000       270.000000       107.000000      12650.000000  

Feature Analysis¶

Clock Speed Analysis¶

Features:

  • core_clock
Distribution¶
InĀ [28]:
# Histogram with a KDE overlay of GPU core clock speeds.
# GPUs without a recorded core clock are left out of the plot.
known_core_clocks = gpu_data['core_clock'].dropna()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(known_core_clocks, kde=True, color='blue', bins=30, ax=ax)

# Title and axis labels
ax.set_title("Distribution of GPU Core Clock Speeds", fontsize=16)
ax.set_xlabel("Core Clock (MHz)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

plt.show()
No description has been provided for this image
Impact on Performance¶
InĀ [29]:
# Pearson correlation of core_clock with each benchmark score
core_clock = gpu_data['core_clock']
correlation_core_clock_avg_g3d_mark = core_clock.corr(gpu_data['avg_g3d_mark'])
correlation_core_clock_test_directx_9 = core_clock.corr(gpu_data['test_directx_9'])
correlation_core_clock_test_directx_10 = core_clock.corr(gpu_data['test_directx_10'])
correlation_core_clock_test_directx_11 = core_clock.corr(gpu_data['test_directx_11'])
correlation_core_clock_test_directx_12 = core_clock.corr(gpu_data['test_directx_12'])
correlation_core_clock_test_gpu_compute = core_clock.corr(gpu_data['test_gpu_compute'])

# One entry per subplot: (benchmark column, display name, colour, correlation)
benchmark_panels = [
    ('avg_g3d_mark', 'Avg G3D Mark', 'blue', correlation_core_clock_avg_g3d_mark),
    ('test_directx_9', 'Test DirectX 9', 'green', correlation_core_clock_test_directx_9),
    ('test_directx_10', 'Test DirectX 10', 'red', correlation_core_clock_test_directx_10),
    ('test_directx_11', 'Test DirectX 11', 'purple', correlation_core_clock_test_directx_11),
    ('test_directx_12', 'Test DirectX 12', 'orange', correlation_core_clock_test_directx_12),
    ('test_gpu_compute', 'Test GPU Compute', 'brown', correlation_core_clock_test_gpu_compute),
]

# Report every coefficient before drawing anything
for benchmark, _, _, coefficient in benchmark_panels:
    print(f"Correlation between core_clock and {benchmark}: {coefficient:.2f}")

sns.set_theme(style="whitegrid")

# 3x2 grid of scatter plots with regression lines; axes.flat walks the grid
# row by row, so panels are placed left-to-right, top-to-bottom.
fig, axes = plt.subplots(3, 2, figsize=(14, 18))
for ax, (benchmark, display_name, colour, coefficient) in zip(axes.flat, benchmark_panels):
    sns.regplot(data=gpu_data, x='core_clock', y=benchmark, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(f"Core Clock vs {display_name} (Correlation: {coefficient:.2f})")

plt.tight_layout()
plt.show()
Correlation between core_clock and avg_g3d_mark: 0.71
Correlation between core_clock and test_directx_9: 0.70
Correlation between core_clock and test_directx_10: 0.63
Correlation between core_clock and test_directx_11: 0.68
Correlation between core_clock and test_directx_12: 0.70
Correlation between core_clock and test_gpu_compute: 0.68
No description has been provided for this image

Memory and Bandwidth Analysis¶

Features:

  • max_memory_size
  • bus_interface
Memory Size¶
InĀ [30]:
# Clone the gpu_data DataFrame so the category column added further down in
# this cell is written to the copy rather than to gpu_data itself
gpu_data_clone = gpu_data.copy()

# Bucket a GPU memory size into a labelled range, treating missing values separately
def categorize_memory_size(memory_size):
    """Return the memory-size bucket label for a single value.

    Missing values (anything ``pd.isna`` reports as NA) map to 'Unknown'.
    Otherwise the value is compared against ascending inclusive upper
    bounds (2048, 4096, 8192, 16384) and the first bucket it fits in is
    returned; anything above the last bound is '>16GB'.
    """
    if pd.isna(memory_size):
        return 'Unknown'

    # (inclusive upper bound, label), in ascending order
    buckets = (
        (2048, '<2GB'),
        (4096, '2–4GB'),
        (8192, '4–8GB'),
        (16384, '8–16GB'),
    )
    for upper_bound, label in buckets:
        if memory_size <= upper_bound:
            return label
    return '>16GB'

# Label every GPU with its memory-size bucket
gpu_data_clone['memory_size_category'] = gpu_data_clone['max_memory_size'].map(categorize_memory_size)

# Mean G3D Mark per bucket; GPUs with unknown memory size are left out
mean_g3d_by_category = gpu_data_clone.groupby('memory_size_category')['avg_g3d_mark'].mean()
memory_size_comparison = mean_g3d_by_category[mean_g3d_by_category.index != 'Unknown']

# Show which buckets actually occur in the data
print("Unique categories in memory_size_comparison:", memory_size_comparison.index)

# Ascending bucket order, restricted to the buckets that are present
category_order = [
    label
    for label in ['<2GB', '2–4GB', '4–8GB', '8–16GB', '>16GB']
    if label in memory_size_comparison.index
]
memory_size_comparison = memory_size_comparison.loc[category_order]

print(memory_size_comparison)

# Bar chart of the per-bucket averages
fig, ax = plt.subplots(figsize=(10, 6))
memory_size_comparison.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
ax.set_title("Average G3D Mark by GPU Memory Size Category", fontsize=16)
ax.set_xlabel("Memory Size Category", fontsize=14)
ax.set_ylabel("Average G3D Mark", fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()
Unique categories in memory_size_comparison: Index(['2–4GB', '4–8GB', '8–16GB', '<2GB'], dtype='object', name='memory_size_category')
memory_size_category
<2GB        579.118182
2–4GB      3846.544118
4–8GB     11477.357143
8–16GB    16148.416667
Name: avg_g3d_mark, dtype: float64
No description has been provided for this image
Bus Interface¶
InĀ [31]:
# Work on a copy of the GPU table
gpu_data_clone = gpu_data.copy()

# Keep only GPUs that report both a bus interface and a G3D score
filtered_gpu_data_clone = gpu_data_clone.dropna(subset=['bus_interface', 'avg_g3d_mark'])

# Mean G3D Mark per bus interface, lowest first
bus_interface_performance = (
    filtered_gpu_data_clone
    .groupby('bus_interface')['avg_g3d_mark']
    .mean()
    .sort_values()
)

# Horizontal bar chart: one bar per bus interface
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=bus_interface_performance.values,
    y=bus_interface_performance.index,
    palette="viridis",
    orient='h',
    ax=ax,
)
ax.set_title("Impact of Bus Interface on GPU Performance (avg_g3d_mark)", fontsize=16)
ax.set_xlabel("Average G3D Mark", fontsize=14)
ax.set_ylabel("Bus Interface", fontsize=14)
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()
No description has been provided for this image

Power Consumption (TDP)¶

Features:

  • max_tdp
Performance vs Power¶
InĀ [32]:
# Copy of the GPU table with max_tdp coerced to numeric (unparseable values become NaN)
gpu_data_clone = gpu_data.copy().assign(
    max_tdp=lambda frame: pd.to_numeric(frame['max_tdp'], errors='coerce')
)

# Pearson correlation between power budget and gaming score
correlation_tdp_g3d = gpu_data_clone['max_tdp'].corr(gpu_data_clone['avg_g3d_mark'])
print(f"Correlation between max_tdp and avg_g3d_mark: {correlation_tdp_g3d:.2f}")

# Scatter plot with a fitted regression line
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=gpu_data_clone, x='max_tdp', y='avg_g3d_mark', color='blue', scatter_kws={'s': 10}, ax=ax)

ax.set_title("Max TDP vs Avg G3D Mark", fontsize=16)
ax.set_xlabel("Max TDP (W)", fontsize=14)
ax.set_ylabel("Avg G3D Mark", fontsize=14)
ax.grid(True)

plt.show()
Correlation between max_tdp and avg_g3d_mark: 0.75
No description has been provided for this image
Efficiency¶
InĀ [33]:
# Rank GPUs by performance per watt (avg_g3d_mark / max_tdp).
gpu_data_clone = gpu_data.copy()

# Coerce both inputs to numeric; anything unparseable becomes NaN
for column in ('avg_g3d_mark', 'max_tdp'):
    gpu_data_clone[column] = pd.to_numeric(gpu_data_clone[column], errors='coerce')

# Keep only rows where the ratio is well defined:
#  - a missing score or TDP would yield NaN, which sort_values places last
#    and would therefore show up in the "bottom 5" listing;
#  - a zero (or negative) TDP would yield inf / a negative ratio and
#    distort both ends of the ranking.
# This mirrors the CPU efficiency analysis, which also drops NaN and
# non-positive efficiency values before ranking.
gpu_data_clone = gpu_data_clone.dropna(subset=['avg_g3d_mark', 'max_tdp'])
gpu_data_clone = gpu_data_clone[gpu_data_clone['max_tdp'] > 0]

# Compute performance efficiency (assign returns a new frame, so no
# chained-assignment warning on the filtered data)
gpu_data_clone = gpu_data_clone.assign(
    efficiency=lambda frame: frame['avg_g3d_mark'] / frame['max_tdp']
)

# Sort the DataFrame by 'efficiency', most efficient first
gpu_data_sorted = gpu_data_clone.sort_values(by='efficiency', ascending=False)

# Display the top 5 rows of the sorted DataFrame
print("Top 5 GPUs by Efficiency:")
print(gpu_data_sorted[['name', 'avg_g3d_mark', 'max_tdp', 'efficiency']].head())

# Display the bottom 5 rows of the sorted DataFrame
print("\nBottom 5 GPUs by Efficiency:")
print(gpu_data_sorted[['name', 'avg_g3d_mark', 'max_tdp', 'efficiency']].tail())
Top 5 GPUs by Efficiency:
                 name  avg_g3d_mark  max_tdp  efficiency
510  radeon pro w6300          5560     25.0  222.400000
591   radeon rx 7600s         14732     75.0  196.426667
593   radeon rx 6700s         14974     80.0  187.175000
617  geforce rtx 4090         27754    150.0  185.026667
556  radeon pro 5600m          9233     50.0  184.660000

Bottom 5 GPUs by Efficiency:
                    name  avg_g3d_mark  max_tdp  efficiency
103       radeon hd 6320           147     45.0    3.266667
121  geforce go 7800 gtx           210     65.0    3.230769
84        radeon hd 6310           122     45.0    2.711111
63        radeon hd 6250            94     35.0    2.685714
70        radeon hd 6290           105     45.0    2.333333

Overall Performance Ratings¶

Features:

  • avg_g3d_mark (3DMark score)
  • test_gpu_compute (compute performance)
Distribution of ratings¶
InĀ [34]:
# One histogram (with KDE overlay) per rating column.
# Each entry: (column, bar colour, figure title, x-axis label)
rating_histograms = [
    ('avg_g3d_mark', 'blue', "Distribution of Avg G3D Mark", "Avg G3D Mark"),
    ('test_gpu_compute', 'green', "Distribution of Test GPU Compute", "Test GPU Compute"),
]

for rating_column, bar_colour, figure_title, x_label in rating_histograms:
    plt.figure(figsize=(12, 6))
    # Missing scores are excluded from the histogram
    sns.histplot(gpu_data[rating_column].dropna(), kde=True, color=bar_colour, bins=30)
    plt.title(figure_title, fontsize=16)
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel("Frequency", fontsize=14)
    plt.grid(True)
    plt.show()
No description has been provided for this image
No description has been provided for this image
Compute vs Gaming¶
InĀ [35]:
# Compare gaming performance (avg_g3d_mark) with compute performance
# (test_gpu_compute), colouring each GPU by a coarse performance tier.
gpu_data_clone = gpu_data.copy()

# Create performance categories based on avg_g3d_mark.
# The last bin is open-ended: avg_g3d_mark goes well beyond 10000 in this
# dataset, and with a closed top edge of 10000 every faster GPU would fall
# outside all bins, get a NaN category and lose its hue in the scatter plot.
bins = [0, 2000, 4000, 6000, 8000, float('inf')]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
gpu_data_clone['performance_category'] = pd.cut(gpu_data_clone['avg_g3d_mark'], bins=bins, labels=labels)

# Calculate correlation (Pearson) between the two scores
correlation_gaming_compute = gpu_data_clone['avg_g3d_mark'].corr(gpu_data_clone['test_gpu_compute'])

# Plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=gpu_data_clone, x='avg_g3d_mark', y='test_gpu_compute', hue='performance_category', alpha=0.7)

# Add titles and labels
plt.title(f"Avg G3D Mark vs Test GPU Compute (Correlation: {correlation_gaming_compute:.2f})", fontsize=16)
plt.xlabel("Avg G3D Mark (Gaming Performance)", fontsize=14)
plt.ylabel("Test GPU Compute (Compute Performance)", fontsize=14)
plt.grid(True)

# Show plot
plt.show()

# Print correlation
print(f"The correlation between avg_g3d_mark and test_gpu_compute is: {correlation_gaming_compute:.2f}")
No description has been provided for this image
The correlation between avg_g3d_mark and test_gpu_compute is: 0.99

Full Laptop Dataframe¶

Source (Laptop Shop)¶

Analyzing number of laptops from each source¶

InĀ [36]:
# Number of listings per shop, most frequent first
source_counts = full_relation['laptop_specs_source'].value_counts()

# Horizontal bar chart: shops on the y-axis, listing counts on the x-axis
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=source_counts.values, y=source_counts.index, palette="viridis")
ax.set_title("Number of laptops per shop", fontsize=16)
ax.set_xlabel("Count", fontsize=14)
ax.set_ylabel("Source", fontsize=14)
plt.xticks(rotation=45)
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate each bar with its integer count
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d')

plt.show()
No description has been provided for this image

Analysing price grouped by source¶

InĀ [37]:
# Use the white-grid seaborn theme for this and later figures
sns.set_theme(style="whitegrid")

# One horizontal box per shop showing the spread of listing prices
plt.figure(figsize=(14, 8))
ax = sns.boxplot(data=full_relation, x='laptop_specs_price', y='laptop_specs_source', palette="viridis")

# Title and axis labels
ax.set_title("Price Distribution by Source", fontsize=16)
ax.set_xlabel("Price", fontsize=14)
ax.set_ylabel("Source", fontsize=14)
ax.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [38]:
# Descriptive price statistics (count, mean, std, min, quartiles, max) per shop
price_stats_by_source = (
    full_relation
    .groupby('laptop_specs_source')['laptop_specs_price']
    .describe()
)

print(price_stats_by_source)
                     count          mean           std         min  \
laptop_specs_source                                                  
cellphones           264.0  2.940125e+07  2.012227e+07   9490000.0   
fptshop              211.0  2.747815e+07  1.818169e+07   9490000.0   
gearvn               148.0  2.554818e+07  1.225296e+07  11990000.0   
hacom                482.0  2.435385e+07  1.207721e+07   8799000.0   
laptopaz             198.0  2.660227e+07  1.243102e+07  11990000.0   
laptopworld           77.0  3.024714e+07  1.227698e+07  16290000.0   
nguyenkim             55.0  1.755909e+07  5.782377e+06   9790000.0   
phongvu              296.0  2.511128e+07  1.133528e+07   9490000.0   
thegioididong        228.0  1.986680e+07  6.136243e+06   7890000.0   

                            25%         50%         75%          max  
laptop_specs_source                                                   
cellphones           17140000.0  23840000.0  34990000.0  182490000.0  
fptshop              16490000.0  21990000.0  31440000.0  128990000.0  
gearvn               18265000.0  22140000.0  25840000.0   89990000.0  
hacom                16799000.0  21199000.0  29374000.0   95699000.0  
laptopaz             17990000.0  23990000.0  30490000.0   85000000.0  
laptopworld          21990000.0  27490000.0  34390000.0   88490000.0  
nguyenkim            13640000.0  16790000.0  20990000.0   32990000.0  
phongvu              17990000.0  21990000.0  27990000.0   83990000.0  
thegioididong        16390000.0  18990000.0  22490000.0   70690000.0  

Brand¶

Analysing number of laptops from each brand¶

InĀ [39]:
# Number of listings per brand, most frequent first
brand_counts = full_relation['laptop_specs_brand'].value_counts()

# Horizontal bar chart: brands on the y-axis, listing counts on the x-axis
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=brand_counts.values, y=brand_counts.index, palette="viridis")
ax.set_title("Number of laptops per brand", fontsize=16)
ax.set_xlabel("Count", fontsize=14)
ax.set_ylabel("Brand", fontsize=14)
plt.xticks(rotation=45)
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate each bar with its integer count
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d')

plt.show()
No description has been provided for this image

Analysing price grouped by brand¶

InĀ [40]:
# Use the white-grid seaborn theme
sns.set_theme(style="whitegrid")

# One horizontal box per brand showing the spread of listing prices
plt.figure(figsize=(14, 8))
ax = sns.boxplot(data=full_relation, x='laptop_specs_price', y='laptop_specs_brand', palette="viridis")

# Title and axis labels
ax.set_title("Price Distribution by Brand", fontsize=16)
ax.set_xlabel("Price", fontsize=14)
ax.set_ylabel("Brand", fontsize=14)
ax.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image

Central Processing Unit (CPU)¶

Basic analysis¶

InĀ [41]:
# Mean listing price and number of listings for every CPU model
cpu_price_groups = full_relation.groupby('laptop_specs_cpu')['laptop_specs_price']
mean_price_by_cpu = cpu_price_groups.agg(['mean', 'count'])
print("Number of unique CPUs:", len(mean_price_by_cpu), end='\n\n')

# Rank by mean price first (while 'mean' is still numeric), then turn the
# mean into a currency string for display only
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='mean', ascending=False)
mean_price_by_cpu['mean'] = mean_price_by_cpu['mean'].map("{:,.2f}đ".format)

for heading, rows in (
    ("Top 10 CPUs by Mean Price:", mean_price_by_cpu.head(10)),
    ("Bottom 10 CPUs by Mean Price:", mean_price_by_cpu.tail(10)),
):
    print(heading)
    print(rows, '\n\n')

# Re-rank by number of listings ('count' is still numeric)
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='count', ascending=False)

for heading, rows in (
    ("Top 10 CPUs by Count:", mean_price_by_cpu.head(10)),
    ("Bottom 10 CPUs by Count:", mean_price_by_cpu.tail(10)),
):
    print(heading)
    print(rows, '\n\n')
Number of unique CPUs: 131

Top 10 CPUs by Mean Price:
                                  mean  count
laptop_specs_cpu                             
apple m3 max 16 core   138,740,000.00đ      2
apple m2 max 12 core   105,990,000.00đ      1
apple m4 max 16 core   102,490,000.00đ      2
intel core i9 13980hx   90,240,000.00đ      4
apple m4 max 14 core    86,656,666.67đ      3
intel core i9 13950hx   85,699,000.00đ      1
intel core i9 11900h    85,000,000.00đ      1
apple m3 max 14 core    82,490,000.00đ      5
intel core i7 13850hx   73,049,000.00đ      2
intel core i9 10885h    72,990,000.00đ      1 


Bottom 10 CPUs by Mean Price:
                               mean  count
laptop_specs_cpu                          
amd ryzen 5 5500u    12,994,500.00đ      2
intel core 3 100u    12,990,000.00đ      1
intel core i3 1220p  12,490,000.00đ      1
amd ryzen 7 5700u    12,415,153.85đ     13
amd ryzen 5 7520u    12,104,454.55đ     22
intel core i3 1315u  11,889,387.10đ     31
intel core i3 8145u  11,640,000.00đ      2
intel core i3 1305u  11,531,272.73đ     11
intel core i3 1215u   9,968,217.39đ     23
intel celeron n4500   8,340,000.00đ      2 


Top 10 CPUs by Count:
                                   mean  count
laptop_specs_cpu                              
intel core i5 13420h     18,797,554.62đ    119
intel core ultra 7 155h  34,566,491.53đ    118
intel core i5 1335u      17,550,769.91đ    113
intel core i7 13620h     23,585,819.82đ    111
intel core i7 1355u      21,408,989.80đ     98
intel core i5 1235u      15,204,220.78đ     77
intel core ultra 5 125h  24,521,516.13đ     62
intel core i5 1334u      16,736,633.33đ     60
intel core i5 12450h     17,077,685.19đ     54
apple m2 8 core          30,680,660.00đ     50 


Bottom 10 CPUs by Count:
                                                    mean  count
laptop_specs_cpu                                               
intel core i5 1230u                       24,490,000.00đ      1
intel core i7 1250u                       24,999,000.00đ      1
amd ryzen 7 4800h                         17,890,000.00đ      1
amd ryzen 7 6800h                         17,690,000.00đ      1
intel core i5 1345u                       24,999,000.00đ      1
intel core ultra 7 256v                   26,690,000.00đ      1
qualcomm snapdragon x elite - x1e-78-100  31,190,000.00đ      1
intel core ultra 7 165u                   31,490,000.00đ      1
intel core i5 11320h                      15,990,000.00đ      1
amd ryzen 7 5800hs                        23,990,000.00đ      1 


Analyzing CPU performance relation with price¶

InĀ [42]:
# Pearson correlation of each CPU benchmark rating with the laptop price
laptop_price = full_relation['laptop_specs_price']
correlation_multithread_price = full_relation['cpu_specs_multithread_rating'].corr(laptop_price)
correlation_single_thread_price = full_relation['cpu_specs_single_thread_rating'].corr(laptop_price)

# Report both coefficients
print(f"Correlation between multithread_rating and price: {correlation_multithread_price:.2f}")
print(f"Correlation between single_thread_rating and price: {correlation_single_thread_price:.2f}")
Correlation between multithread_rating and price: 0.57
Correlation between single_thread_rating and price: 0.50
InĀ [43]:
# Use the white-grid seaborn theme
sns.set_theme(style="whitegrid")

# Side-by-side scatter plots (with regression lines) of CPU rating vs price.
# Each entry: (rating column, display name, colour); order is left to right.
rating_panels = [
    ('cpu_specs_single_thread_rating', 'Single Thread Rating', 'blue'),
    ('cpu_specs_multithread_rating', 'Multithread Rating', 'green'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, (rating_column, display_name, colour) in zip(axes, rating_panels):
    sns.regplot(data=full_relation, x=rating_column, y='laptop_specs_price', ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(f'{display_name} vs Price')
    ax.set_xlabel(display_name)
    ax.set_ylabel('Price')

plt.tight_layout()
plt.show()
No description has been provided for this image

Graphics Processing Unit (GPU)¶

Basic analysis¶

InĀ [44]:
# Mean listing price and number of listings for every GPU model
# (the GPU name lives in the 'laptop_specs_vga' column)
gpu_price_groups = full_relation.groupby('laptop_specs_vga')['laptop_specs_price']
mean_price_by_gpu = gpu_price_groups.agg(['mean', 'count'])
print("Number of unique GPUs:", len(mean_price_by_gpu), end='\n\n')

# Rank by mean price first (while 'mean' is still numeric), then turn the
# mean into a currency string for display only
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='mean', ascending=False)
mean_price_by_gpu['mean'] = mean_price_by_gpu['mean'].map("{:,.2f}đ".format)

for heading, rows in (
    ("Top 10 GPUs by Mean Price:", mean_price_by_gpu.head(10)),
    ("Bottom 10 GPUs by Mean Price:", mean_price_by_gpu.tail(10)),
):
    print(heading)
    print(rows, '\n\n')

# Re-rank by number of listings ('count' is still numeric)
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='count', ascending=False)

for heading, rows in (
    ("Top 10 GPUs by Count:", mean_price_by_gpu.head(10)),
    ("Bottom 10 GPUs by Count:", mean_price_by_gpu.tail(10)),
):
    print(heading)
    print(rows, '\n\n')
Number of unique GPUs: 23

Top 10 GPUs by Mean Price:
                                   mean  count
laptop_specs_vga                              
geforce rtx 4090         93,490,000.00đ      4
geforce rtx 4080         76,677,500.00đ      8
rtx 2000 ada generation  75,153,571.43đ      7
geforce rtx 2060         55,990,000.00đ      1
rtx a1000                49,597,000.00đ      4
geforce rtx 4070         47,671,538.46đ     26
rtx a500                 47,532,333.33đ      3
geforce gtx 1650 ti      40,990,000.00đ      2
geforce rtx 3070 ti      37,490,000.00đ      1
geforce rtx 3060         35,521,900.00đ     10 


Bottom 10 GPUs by Mean Price:
                            mean  count
laptop_specs_vga                       
geforce mx570     25,099,000.00đ      2
radeon rx 7600s   23,490,000.00đ      1
geforce mx450     22,994,500.00đ      2
geforce rtx 3050  22,533,222.89đ    166
geforce mx550     20,602,454.55đ     11
geforce rtx 2050  18,259,426.23đ     61
geforce mx250     18,190,000.00đ      1
geforce mx350     17,990,000.00đ      1
geforce gtx 1650  17,623,333.33đ      3
radeon rx 6550m   15,540,000.00đ      2 


Top 10 GPUs by Count:
                                   mean  count
laptop_specs_vga                              
geforce rtx 3050         22,533,222.89đ    166
geforce rtx 4050         27,156,802.63đ    152
geforce rtx 4060         34,472,539.82đ    113
geforce rtx 2050         18,259,426.23đ     61
geforce rtx 4070         47,671,538.46đ     26
geforce mx550            20,602,454.55đ     11
geforce rtx 3050 ti      30,670,000.00đ     10
geforce rtx 3060         35,521,900.00đ     10
geforce rtx 4080         76,677,500.00đ      8
rtx 2000 ada generation  75,153,571.43đ      7 


Bottom 10 GPUs by Count:
                               mean  count
laptop_specs_vga                          
geforce gtx 1650     17,623,333.33đ      3
geforce gtx 1650 ti  40,990,000.00đ      2
geforce mx570        25,099,000.00đ      2
geforce mx450        22,994,500.00đ      2
radeon rx 6550m      15,540,000.00đ      2
geforce rtx 3070 ti  37,490,000.00đ      1
radeon rx 7600s      23,490,000.00đ      1
geforce rtx 2060     55,990,000.00đ      1
geforce mx250        18,190,000.00đ      1
geforce mx350        17,990,000.00đ      1 


Analyzing GPU performance relation with price¶

InĀ [45]:
# Pearson correlation between the GPU's G3D score and the laptop price
gpu_g3d_scores = full_relation['gpu_specs_avg_g3d_mark']
correlation_avg_g3d_mark_price = gpu_g3d_scores.corr(full_relation['laptop_specs_price'])

print(f"Correlation between avg_g3d_mark and price: {correlation_avg_g3d_mark_price:.2f}")
Correlation between avg_g3d_mark and price: 0.59
InĀ [46]:
# Use the white-grid seaborn theme
sns.set_theme(style="whitegrid")

# Scatter plot of GPU score against laptop price with a fitted regression line
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=full_relation, x='gpu_specs_avg_g3d_mark', y='laptop_specs_price', color='blue', scatter_kws={'s': 10}, ax=ax)

# Title and axis labels
ax.set_title("Correlation between Avg G3D Mark and Price", fontsize=16)
ax.set_xlabel("Avg G3D Mark", fontsize=14)
ax.set_ylabel("Price", fontsize=14)
ax.grid(True)

fig.tight_layout()
plt.show()
No description has been provided for this image

Random Access Memory (RAM)¶

Basic analysis¶

InĀ [47]:
# Frequency tables for the two RAM attributes, most common value first
ram_amount_counts = full_relation['laptop_specs_ram_amount'].value_counts()
ram_type_counts = full_relation['laptop_specs_ram_type'].value_counts()

# Heading and table go on separate lines via sep='\n'
print("Unique RAM amounts and their counts:", ram_amount_counts, sep='\n')
print("\nUnique RAM types and their counts:", ram_type_counts, sep='\n')
Unique RAM amounts and their counts:
laptop_specs_ram_amount
16.0     1202
8.0       459
32.0      182
24.0       46
4.0        19
12.0       16
36.0       14
64.0        9
48.0        4
18.0        4
96.0        1
128.0       1
Name: count, dtype: int64

Unique RAM types and their counts:
laptop_specs_ram_type
ddr5    1043
ddr4     752
Name: count, dtype: int64
InĀ [48]:
# Convert RAM amount to categorical type so later plots treat each size as a
# discrete group. NOTE: this mutates full_relation in place; later cells that
# need numeric values cast the column back with .astype(float).
full_relation['laptop_specs_ram_amount'] = pd.Categorical(full_relation['laptop_specs_ram_amount'])

# Vertical bar chart: one bar per RAM size (x), bar height = number of laptops (y).
# The index is cast float -> int -> str so the tick labels read "16" rather than "16.0".
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=ram_amount_counts.index.astype(int).astype(str), y=ram_amount_counts.values, palette="viridis")
plt.title("Number of Laptops by RAM Amount", fontsize=16)
# The axis labels were previously swapped: RAM sizes are on x, counts on y.
plt.xlabel("RAM Amount (GB)", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels (the count) on top of every bar
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
No description has been provided for this image
InĀ [49]:
# Donut chart (pie with a 0.3-wide ring) of the share of each RAM type
plt.figure(figsize=(8, 8))
ax = ram_type_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#99ff99'],
    labels=ram_type_counts.index,
    wedgeprops=dict(width=0.3),
)

# Title on the Axes pandas drew into
ax.set_title("Distribution of RAM Types", fontsize=16)

# Render the figure
plt.show()
No description has been provided for this image

Analyzing RAM performance relation with price¶

InĀ [50]:
# The RAM column was converted to a categorical earlier in the notebook, so it
# is cast back to float before correlating it with price.
ram_amount_numeric = full_relation['laptop_specs_ram_amount'].astype(float)
correlation_ram_price = ram_amount_numeric.corr(full_relation['laptop_specs_price'])

# Report the coefficient rounded to two decimals
print(f"Correlation between RAM amount and price: {correlation_ram_price:.2f}")
Correlation between RAM amount and price: 0.66
InĀ [51]:
# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# One box per RAM size showing how prices spread within that size
plt.figure(figsize=(14, 8))
ax = sns.boxplot(
    data=full_relation,
    x='laptop_specs_ram_amount',
    y='laptop_specs_price',
    palette="viridis",
)

# Title, axis labels and grid
ax.set_title("Price Distribution by RAM Amount", fontsize=16)
ax.set_xlabel("RAM Amount (GB)", fontsize=14)
ax.set_ylabel("Price", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [52]:
# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# Filled density curves of price, one curve per RAM type
plt.figure(figsize=(14, 8))
ax = sns.kdeplot(
    data=full_relation,
    x='laptop_specs_price',
    hue='laptop_specs_ram_type',
    fill=True,
    palette="viridis",
)

# Title, axis labels and grid
ax.set_title("Price Distribution by RAM Type", fontsize=16)
ax.set_xlabel("Price", fontsize=14)
ax.set_ylabel("Density", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
No description has been provided for this image

Storage¶

Basic analysis¶

InĀ [53]:
# Work on a separate frame so full_relation keeps its original storage column.
# Non-numeric storage values are coerced to NaN.
full_relation_clone = full_relation.assign(
    laptop_specs_storage_amount=pd.to_numeric(
        full_relation['laptop_specs_storage_amount'], errors='coerce'
    )
)

# Keep only rows whose storage amount is at least 128 (NaN rows drop out too)
has_enough_storage = full_relation_clone['laptop_specs_storage_amount'] >= 128
full_relation_clone = full_relation_clone[has_enough_storage]

# Frequency tables for storage size and storage type on the filtered rows
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()
storage_type_counts = full_relation_clone['laptop_specs_storage_type'].value_counts()

print("Unique storage amounts and their counts:", storage_amount_counts, sep="\n")
print("\nUnique storage types and their counts:", storage_type_counts, sep="\n")
Unique storage amounts and their counts:
laptop_specs_storage_amount
512.0     1227
1024.0     338
256.0      121
2048.0      14
8192.0       1
Name: count, dtype: int64

Unique storage types and their counts:
laptop_specs_storage_type
ssd    1636
hdd       6
Name: count, dtype: int64
InĀ [54]:
# Treat each storage size as a discrete category from here on
full_relation_clone['laptop_specs_storage_amount'] = pd.Categorical(full_relation_clone['laptop_specs_storage_amount'])

# Recount on the categorical column, then draw one bar per storage size.
# Labels are cast to int and then str so ticks read "512" rather than "512.0".
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()
storage_labels = storage_amount_counts.index.astype(int).astype(str)

plt.figure(figsize=(12, 8))
ax = sns.barplot(x=storage_labels, y=storage_amount_counts.values, palette="viridis")
ax.set_title("Number of Laptops by Storage Amount", fontsize=16)
ax.set_xlabel("Storage Amount (GB)", fontsize=14)
ax.set_ylabel("Count", fontsize=14)
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate every bar with its count
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d')

# Render the figure
plt.show()
No description has been provided for this image

Analyzing Storage relation with price¶

InĀ [55]:
# The storage column is categorical at this point, so cast it back to float
# before correlating it with price.
storage_amount_numeric = full_relation_clone['laptop_specs_storage_amount'].astype(float)
correlation_storage_price = storage_amount_numeric.corr(full_relation_clone['laptop_specs_price'])

# Report the coefficient rounded to two decimals
print(f"Correlation between storage amount and price: {correlation_storage_price:.2f}")
Correlation between storage amount and price: 0.56
InĀ [56]:
# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# One box per storage size showing how prices spread within that size
plt.figure(figsize=(14, 8))
ax = sns.boxplot(
    data=full_relation_clone,
    x='laptop_specs_storage_amount',
    y='laptop_specs_price',
    palette="viridis",
)

# Title, axis labels and grid
ax.set_title("Price Distribution by Storage Amount", fontsize=16)
ax.set_xlabel("Storage Amount (GB)", fontsize=14)
ax.set_ylabel("Price", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
No description has been provided for this image

Screen Features¶

Basic analysis¶

InĀ [57]:
# describe() summaries for the three numeric screen columns
screen_size_stats = full_relation['laptop_specs_screen_size'].describe()
refresh_rate_stats = full_relation['laptop_specs_screen_refresh_rate'].describe()
brightness_stats = full_relation['laptop_specs_screen_brightness'].describe()

# Print each summary under its own heading; blocks after the first are
# preceded by a blank line.
print("Summary Statistics for Screen Size:", screen_size_stats, sep="\n")
print("\nSummary Statistics for Screen Refresh Rate:", refresh_rate_stats, sep="\n")
print("\nSummary Statistics for Screen Brightness:", brightness_stats, sep="\n")
Summary Statistics for Screen Size:
count    1707.000000
mean       14.971822
std         0.959482
min        13.000000
25%        14.000000
50%        15.600000
75%        15.600000
max        18.000000
Name: laptop_specs_screen_size, dtype: float64

Summary Statistics for Screen Refresh Rate:
count    1266.000000
mean      109.504739
std        47.309397
min        60.000000
25%        60.000000
50%       120.000000
75%       144.000000
max       480.000000
Name: laptop_specs_screen_refresh_rate, dtype: float64

Summary Statistics for Screen Brightness:
count    1148.000000
mean      333.719512
std       103.811275
min       220.000000
25%       250.000000
50%       300.000000
75%       400.000000
max      1200.000000
Name: laptop_specs_screen_brightness, dtype: float64
InĀ [58]:
# Frequency table of screen resolutions
screen_resolution_counts = full_relation['laptop_specs_screen_resolution'].value_counts()
print("Unique screen resolutions and their counts:", screen_resolution_counts, sep="\n")

# Horizontal bar chart: one bar per resolution (y), bar length = number of laptops (x)
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    y=screen_resolution_counts.index,
    x=screen_resolution_counts.values,
    palette="viridis",
)
ax.set_title("Number of Laptops by Screen Resolution", fontsize=16)
ax.set_xlabel("Count", fontsize=14)
ax.set_ylabel("Screen Resolution", fontsize=14)
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate every bar with its count
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d')

# Render the figure
plt.show()
Unique screen resolutions and their counts:
laptop_specs_screen_resolution
1920x1080    978
1920x1200    406
2880x1800    156
2560x1600    144
3024x1964     31
2880x1864     29
2560x1664     21
2880x1920     16
2880x1620     13
3840x2400     12
2560x1644     12
3200x2000     12
3456x2234     11
2560x1440     10
2240x1400      7
3072x1920      6
2048x1280      6
1366x768       5
3456x2160      2
2960x1848      1
2220x1080      1
3201x2000      1
2256x1504      1
3000x2000      1
3840x2160      1
2160x1440      1
Name: count, dtype: int64
No description has been provided for this image

Analysis of screen features with price¶

InĀ [59]:
# Start from a fresh copy so the filtering below never touches full_relation
full_relation_clone = full_relation.copy()

# Count listings per resolution and keep the resolutions with at least 20 of them
screen_resolution_counts = full_relation_clone['laptop_specs_screen_resolution'].value_counts()
is_common_resolution = screen_resolution_counts >= 20
filtered_screen_resolutions = screen_resolution_counts[is_common_resolution].index

# Restrict the copy to those resolutions and sort by resolution, which fixes
# the order of the boxes in the plot.
full_relation_clone = (
    full_relation_clone
    .loc[full_relation_clone['laptop_specs_screen_resolution'].isin(filtered_screen_resolutions)]
    .sort_values(by='laptop_specs_screen_resolution')
)

# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# Horizontal boxplot: one box per resolution, price along the x-axis
plt.figure(figsize=(14, 8))
ax = sns.boxplot(
    data=full_relation_clone,
    y='laptop_specs_screen_resolution',
    x='laptop_specs_price',
    palette="viridis",
)

# Title, axis labels and grid
ax.set_title("Price Distribution by Screen Resolution", fontsize=16)
ax.set_ylabel("Screen Resolution", fontsize=14)
ax.set_xlabel("Price", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [60]:
# Correlation of each numeric screen feature with price
laptop_price = full_relation['laptop_specs_price']
correlation_screen_size_price = full_relation['laptop_specs_screen_size'].corr(laptop_price)
correlation_refresh_rate_price = full_relation['laptop_specs_screen_refresh_rate'].corr(laptop_price)
correlation_brightness_price = full_relation['laptop_specs_screen_brightness'].corr(laptop_price)

print(f"Correlation between screen size and price: {correlation_screen_size_price:.2f}")
print(f"Correlation between screen refresh rate and price: {correlation_refresh_rate_price:.2f}")
print(f"Correlation between screen brightness and price: {correlation_brightness_price:.2f}")

# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# One panel per screen feature: (column, colour, panel title, x-axis label)
screen_panels = [
    ('laptop_specs_screen_size', 'blue', 'Price vs. Screen Size', 'Screen Size (inches)'),
    ('laptop_specs_screen_refresh_rate', 'green', 'Price vs. Screen Refresh Rate', 'Screen Refresh Rate (Hz)'),
    ('laptop_specs_screen_brightness', 'red', 'Price vs. Screen Brightness', 'Screen Brightness (nits)'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Draw scatter + regression line for each feature, left to right
for panel_ax, (feature, colour, panel_title, x_label) in zip(axes, screen_panels):
    sns.regplot(
        data=full_relation,
        x=feature,
        y='laptop_specs_price',
        ax=panel_ax,
        color=colour,
        scatter_kws={'alpha': 0.7},
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Price (VND)')

plt.tight_layout()
plt.show()
Correlation between screen size and price: 0.06
Correlation between screen refresh rate and price: 0.29
Correlation between screen brightness and price: 0.50
No description has been provided for this image

Portability Features¶

Weight¶

Basic analysis

InĀ [61]:
# describe() summary of the laptop weight column
weight_stats = full_relation['laptop_specs_weight'].describe()
print("Summary Statistics for Weight:", weight_stats, sep="\n")
Summary Statistics for Weight:
count    1577.000000
mean        1.728397
std         0.415082
min         0.879000
25%         1.400000
50%         1.650000
75%         2.000000
max         4.000000
Name: laptop_specs_weight, dtype: float64
InĀ [62]:
# Histogram (30 bins) with a KDE overlay of the non-missing laptop weights
known_weights = full_relation['laptop_specs_weight'].dropna()

plt.figure(figsize=(10, 6))
ax = sns.histplot(known_weights, kde=True, color='blue', bins=30)

# Title and axis labels
ax.set_title("Distribution of Laptop Weights", fontsize=16)
ax.set_xlabel("Weight (kg)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image

Analysis of weight with price

InĀ [63]:
# Correlate laptop weight with price
laptop_weight = full_relation['laptop_specs_weight']
correlation_weight_price = laptop_weight.corr(full_relation['laptop_specs_price'])

# Report the coefficient rounded to two decimals
print(f"Correlation between weight and price: {correlation_weight_price:.2f}")
Correlation between weight and price: 0.17
InĀ [64]:
# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# Scatter of weight against price, with a fitted regression line
plt.figure(figsize=(10, 6))
ax = sns.regplot(
    data=full_relation,
    x='laptop_specs_weight',
    y='laptop_specs_price',
    color='blue',
    scatter_kws={'s': 10},
)

# Title, axis labels and grid
ax.set_title("Weight vs Price", fontsize=16)
ax.set_xlabel("Weight (kg)", fontsize=14)
ax.set_ylabel("Price (VND)", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
No description has been provided for this image

Length, Width, Height¶

Basic analysis

InĀ [65]:
# Calculate summary statistics for length, width, and height.
# The dataset has no "length" column, so depth is reported as length here.
# Previously the height column was labelled "Length" and the depth column
# "Height", which mislabelled both summaries.
# NOTE(review): depth is presumably the front-to-back size and height the
# closed-lid thickness — confirm against the scraper.
length_stats = full_relation['laptop_specs_depth'].describe()
width_stats = full_relation['laptop_specs_width'].describe()
height_stats = full_relation['laptop_specs_height'].describe()

# Print the results
print("Summary Statistics for Length:")
print(length_stats)

print("\nSummary Statistics for Width:")
print(width_stats)

print("\nSummary Statistics for Height:")
print(height_stats)
Summary Statistics for Length:
count    1498.000000
mean        1.907049
std         0.647121
min         0.930000
25%         1.690000
50%         1.830000
75%         1.990000
max        22.700000
Name: laptop_specs_height, dtype: float64

Summary Statistics for Width:
count    1498.000000
mean       34.120287
std         2.341405
min        28.700000
25%        31.560000
50%        35.610000
75%        35.940000
max        50.500000
Name: laptop_specs_width, dtype: float64

Summary Statistics for Height:
count    1498.000000
mean       23.461589
std         1.766152
min         3.000000
25%        22.100000
50%        23.500000
75%        24.770000
max        31.600000
Name: laptop_specs_depth, dtype: float64
InĀ [66]:
# Set the plot style
sns.set_theme(style="whitegrid")

# One panel per physical dimension: (column, colour, dimension name).
# "Length" is taken from the depth column and "Height" from the height column;
# the two were previously cross-wired, so each panel showed the other's data.
dimension_panels = [
    ('laptop_specs_depth', 'blue', 'Length'),
    ('laptop_specs_width', 'green', 'Width'),
    ('laptop_specs_height', 'red', 'Height'),
]

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Histogram (30 bins) with KDE overlay of the non-missing values of each dimension
for panel_ax, (dimension_column, colour, dimension_name) in zip(axes, dimension_panels):
    sns.histplot(full_relation[dimension_column].dropna(), kde=True, color=colour, bins=30, ax=panel_ax)
    panel_ax.set_title(f"Distribution of Laptop {dimension_name}", fontsize=16)
    panel_ax.set_xlabel(f"{dimension_name} (cm)", fontsize=14)
    panel_ax.set_ylabel("Frequency", fontsize=14)

plt.tight_layout()
plt.show()
No description has been provided for this image

Analysis of dimensions with price

InĀ [67]:
# Calculate the correlation between length, width, height, and price.
# "Length" is the depth column and "height" is the height column; previously
# the two were cross-wired, so the printed length/height figures were swapped.
correlation_length_price = full_relation['laptop_specs_depth'].corr(full_relation['laptop_specs_price'])
correlation_width_price = full_relation['laptop_specs_width'].corr(full_relation['laptop_specs_price'])
correlation_height_price = full_relation['laptop_specs_height'].corr(full_relation['laptop_specs_price'])

# Print the correlation results
print(f"Correlation between length and price: {correlation_length_price:.2f}")
print(f"Correlation between width and price: {correlation_width_price:.2f}")
print(f"Correlation between height and price: {correlation_height_price:.2f}")
Correlation between length and price: -0.00
Correlation between width and price: -0.11
Correlation between height and price: 0.13
InĀ [68]:
# Set the plot style
sns.set_theme(style="whitegrid")

# One panel per physical dimension: (column, colour, dimension name).
# "Length" is taken from the depth column and "Height" from the height column;
# the two were previously cross-wired, so each panel showed the other's data.
dimension_panels = [
    ('laptop_specs_depth', 'blue', 'Length'),
    ('laptop_specs_width', 'green', 'Width'),
    ('laptop_specs_height', 'red', 'Height'),
]

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Scatter + regression line of each dimension against price
for panel_ax, (dimension_column, colour, dimension_name) in zip(axes, dimension_panels):
    sns.regplot(data=full_relation, x=dimension_column, y='laptop_specs_price', ax=panel_ax, color=colour, scatter_kws={'s': 10})
    panel_ax.set_title(f'{dimension_name} vs Price')
    panel_ax.set_xlabel(f'{dimension_name} (cm)')
    panel_ax.set_ylabel('Price (VND)')

plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [69]:
# Re-create the working copy from the full dataset. The previous
# full_relation_clone was left over from the screen-resolution analysis, where
# it was filtered to resolutions with at least 20 listings, so reusing it
# silently computed the volume correlation on a subset of the laptops.
full_relation_clone = full_relation.copy()

# Volume is the product of the three physical dimensions (height x width x depth)
full_relation_clone['volume'] = full_relation_clone['laptop_specs_height'] * full_relation_clone['laptop_specs_width'] * full_relation_clone['laptop_specs_depth']

# Calculate the correlation between volume and price
correlation_volume_price = full_relation_clone['volume'].corr(full_relation_clone['laptop_specs_price'])

# Print the correlation result
print(f"Correlation between volume and price: {correlation_volume_price:.2f}")

# Plot the correlation between volume and price
plt.figure(figsize=(10, 6))
sns.regplot(data=full_relation_clone, x='volume', y='laptop_specs_price', color='blue', scatter_kws={'s': 10})

# Add titles and labels
plt.title("Volume vs Price", fontsize=16)
plt.xlabel("Volume (cm³)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
Correlation between volume and price: 0.03
No description has been provided for this image

Battery and Power¶

Basic Analysis¶

InĀ [70]:
# describe() summaries for battery capacity and battery cell count
battery_amount_stats = full_relation['laptop_specs_battery_capacity'].describe()
battery_cells_stats = full_relation['laptop_specs_battery_cells'].describe()

# Capacity first, then cells (separated by a blank line)
print("Summary Statistics for Battery Capacity:", battery_amount_stats, sep="\n")
print("\nSummary Statistics for Battery Cells:", battery_cells_stats, sep="\n")
Summary Statistics for Battery Capacity:
count    1707.000000
mean       58.177510
std        23.062029
min        36.000000
25%        47.000000
50%        55.000000
75%        65.000000
max       800.000000
Name: laptop_specs_battery_capacity, dtype: float64

Summary Statistics for Battery Cells:
count    1236.000000
mean        3.442557
std         0.653190
min         2.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         6.000000
Name: laptop_specs_battery_cells, dtype: float64
InĀ [71]:
# Histogram (30 bins) with a KDE overlay of the non-missing battery capacities
known_capacities = full_relation['laptop_specs_battery_capacity'].dropna()

plt.figure(figsize=(10, 6))
ax = sns.histplot(known_capacities, kde=True, color='blue', bins=30)

# Title and axis labels
ax.set_title("Distribution of Laptop Battery Capacity", fontsize=16)
ax.set_xlabel("Battery Capacity (Wh)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image

Analysis of battery and power features with price

InĀ [72]:
# Correlate both battery features with price
laptop_price = full_relation['laptop_specs_price']
correlation_battery_capacity_price = full_relation['laptop_specs_battery_capacity'].corr(laptop_price)
correlation_battery_cells_price = full_relation['laptop_specs_battery_cells'].corr(laptop_price)

# Report both coefficients rounded to two decimals
for battery_feature, coefficient in (
    ("battery capacity", correlation_battery_capacity_price),
    ("battery cells", correlation_battery_cells_price),
):
    print(f"Correlation between {battery_feature} and price: {coefficient:.2f}")
Correlation between battery capacity and price: 0.44
Correlation between battery cells and price: 0.59
InĀ [73]:
# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# Scatter of battery capacity against price, with a fitted regression line
plt.figure(figsize=(10, 6))
ax = sns.regplot(
    data=full_relation,
    x='laptop_specs_battery_capacity',
    y='laptop_specs_price',
    color='blue',
    scatter_kws={'s': 10},
)

# Title, axis labels and grid
ax.set_title("Battery Capacity vs Price", fontsize=16)
ax.set_xlabel("Battery Capacity (Wh)", fontsize=14)
ax.set_ylabel("Price (VND)", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
No description has been provided for this image

Connectivity Features¶

Basic analysis¶

InĀ [74]:
# Frequency table for every port-count column; all five are reused by the pie
# charts in the next cell.
usb_a_counts = full_relation['laptop_specs_number_usb_a_ports'].value_counts()
usb_c_counts = full_relation['laptop_specs_number_usb_c_ports'].value_counts()
hdmi_counts = full_relation['laptop_specs_number_hdmi_ports'].value_counts()
ethernet_counts = full_relation['laptop_specs_number_ethernet_ports'].value_counts()
audio_jack_counts = full_relation['laptop_specs_number_audio_jacks'].value_counts()

# Print each table under its heading; every heading after the first is
# preceded by a blank line.
port_tables = [
    ("USB-A ports", usb_a_counts),
    ("USB-C ports", usb_c_counts),
    ("HDMI ports", hdmi_counts),
    ("Ethernet ports", ethernet_counts),
    ("audio jacks", audio_jack_counts),
]
for position, (port_label, port_table) in enumerate(port_tables):
    separator = "\n" if position else ""
    print(f"{separator}Unique values and counts for number of {port_label}:")
    print(port_table)
Unique values and counts for number of USB-A ports:
laptop_specs_number_usb_a_ports
0.0     850
2.0     441
3.0     241
1.0     171
4.0      41
6.0       4
5.0       3
12.0      3
8.0       2
Name: count, dtype: int64

Unique values and counts for number of USB-C ports:
laptop_specs_number_usb_c_ports
1.0    831
2.0    411
0.0    404
4.0     50
3.0     32
8.0     22
5.0      6
Name: count, dtype: int64

Unique values and counts for number of HDMI ports:
laptop_specs_number_hdmi_ports
1.0    1517
0.0     239
Name: count, dtype: int64

Unique values and counts for number of Ethernet ports:
laptop_specs_number_ethernet_ports
0.0    1470
1.0     286
Name: count, dtype: int64

Unique values and counts for number of audio jacks:
laptop_specs_number_audio_jacks
0.0    1039
1.0     717
Name: count, dtype: int64
InĀ [75]:
# Pie chart of the count distribution for each connectivity port type.
# matplotlib (plt) and seaborn (sns) are already imported in the first cell of
# the notebook, so they are not re-imported here.

# (value counts, display label) for every port type, in plotting order
port_distributions = [
    (usb_a_counts, "USB-A Ports"),
    (usb_c_counts, "USB-C Ports"),
    (hdmi_counts, "HDMI Ports"),
    (ethernet_counts, "Ethernet Ports"),
    (audio_jack_counts, "Audio Jacks"),
]

# Set up a 2x3 grid and flatten it so panels can be addressed by position
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for port_ax, (port_counts, port_label) in zip(axes, port_distributions):
    port_counts.plot(
        kind='pie',
        autopct='%1.1f%%',
        startangle=140,
        ax=port_ax,
        colors=sns.color_palette('pastel', len(port_counts)),
        labels=None  # Wedge labels are omitted; the legend carries them instead
    )
    port_ax.set_title(f"Distribution of {port_label}")
    port_ax.set_ylabel('')
    port_ax.legend(port_counts.index, title=port_label, loc="best")

# Remove the grid cells that have no chart (the sixth panel)
for unused_ax in axes[len(port_distributions):]:
    fig.delaxes(unused_ax)

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image

Analysis of connectivity with price¶

InĀ [76]:
# Correlate every port-count column with price
laptop_price = full_relation['laptop_specs_price']
correlation_usb_a_price = full_relation['laptop_specs_number_usb_a_ports'].corr(laptop_price)
correlation_usb_c_price = full_relation['laptop_specs_number_usb_c_ports'].corr(laptop_price)
correlation_hdmi_price = full_relation['laptop_specs_number_hdmi_ports'].corr(laptop_price)
correlation_ethernet_price = full_relation['laptop_specs_number_ethernet_ports'].corr(laptop_price)
correlation_audio_jack_price = full_relation['laptop_specs_number_audio_jacks'].corr(laptop_price)

# Report each coefficient rounded to two decimals
for port_label, coefficient in (
    ("USB-A ports", correlation_usb_a_price),
    ("USB-C ports", correlation_usb_c_price),
    ("HDMI ports", correlation_hdmi_price),
    ("Ethernet ports", correlation_ethernet_price),
    ("audio jacks", correlation_audio_jack_price),
):
    print(f"Correlation between number of {port_label} and price: {coefficient:.2f}")
Correlation between number of USB-A ports and price: -0.14
Correlation between number of USB-C ports and price: 0.00
Correlation between number of HDMI ports and price: -0.18
Correlation between number of Ethernet ports and price: -0.04
Correlation between number of audio jacks and price: 0.03

Software Features¶

Default OS¶

Basic analysis

InĀ [77]:
# Normalise every OS label that contains "window" (case-insensitive) to the
# single value 'windows'. The vectorised string test treats missing values
# (None or NaN) as non-matching, so they are left untouched; the previous
# per-row lambda only guarded against None and would raise on a float NaN.
default_os = full_relation['laptop_specs_default_os']
is_windows_variant = default_os.str.contains('window', case=False, na=False, regex=False)
full_relation['laptop_specs_default_os'] = default_os.mask(is_windows_variant, 'windows')

# Count and print the normalised OS labels. The count previously taken before
# normalisation was never used, so it has been dropped.
os_counts = full_relation['laptop_specs_default_os'].value_counts()
print("Unique OS and their counts:")
print(os_counts)
Unique OS and their counts:
laptop_specs_default_os
windows      1703
macos         140
linux          30
chrome os       2
Name: count, dtype: int64
InĀ [78]:
# Donut chart (pie with a 0.3-wide ring) of the share of each default OS
os_wedge_colours = ['#66b3ff', '#99ff99', '#ffcc99', '#ff9999']

plt.figure(figsize=(8, 8))
ax = os_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    colors=os_wedge_colours,
    labels=None,  # wedge labels are omitted; the legend names the categories
    wedgeprops=dict(width=0.3),
    textprops={'fontsize': 10},  # size of the percentage text
)

# Legend listing the OS names, then the title
ax.legend(os_counts.index, loc="best")
ax.set_title("Distribution of Default OS", fontsize=16)

# Render the figure
plt.show()
No description has been provided for this image

Warranty¶

InĀ [79]:
# Frequency table of warranty values
warranty_counts = full_relation['laptop_specs_warranty'].value_counts()
print("Unique warranty values and their counts:", warranty_counts, sep="\n")
Unique warranty values and their counts:
laptop_specs_warranty
12.0    851
24.0    765
36.0     76
18.0      1
Name: count, dtype: int64
InĀ [80]:
# Correlate warranty with price and report the coefficient
warranty = full_relation['laptop_specs_warranty']
correlation_warranty_price = warranty.corr(full_relation['laptop_specs_price'])
print(f"Correlation between warranty and price: {correlation_warranty_price:.2f}")

# Use seaborn's white-grid theme for this figure
sns.set_theme(style="whitegrid")

# One box per warranty value showing how prices spread within it
plt.figure(figsize=(14, 8))
ax = sns.boxplot(
    data=full_relation,
    x='laptop_specs_warranty',
    y='laptop_specs_price',
    palette="viridis",
)

# Title, axis labels and grid
ax.set_title("Price Distribution by Warranty", fontsize=16)
ax.set_xlabel("Warranty (months)", fontsize=14)
ax.set_ylabel("Price (VND)", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
Correlation between warranty and price: 0.05
No description has been provided for this image

Target Feature: price¶

Basic statistics

InĀ [81]:
# describe() summary of the target column (laptop price)
price_stats = full_relation['laptop_specs_price'].describe()

# Heading followed by the summary on the next line
print("Basic Statistics for Price:", price_stats, sep="\n")
Basic Statistics for Price:
count    1.959000e+03
mean     2.532114e+07
std      1.386554e+07
min      7.890000e+06
25%      1.699000e+07
50%      2.189000e+07
75%      2.939000e+07
max      1.824900e+08
Name: laptop_specs_price, dtype: float64

Visualizing the distribution

InĀ [82]:
# Histogram (30 bins) with a KDE overlay of laptop prices
plt.figure(figsize=(12, 6))
ax = sns.histplot(full_relation['laptop_specs_price'], kde=True, color='blue', bins=30)

# Title and axis labels
ax.set_title("Distribution of Laptop Prices", fontsize=16)
ax.set_xlabel("Price (VND)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image
InĀ [83]:
# Single horizontal boxplot of laptop prices
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=full_relation, x='laptop_specs_price', palette="viridis")

# Title, axis label and grid
ax.set_title("Boxplot of Laptop Prices", fontsize=16)
ax.set_xlabel("Price (VND)", fontsize=14)
ax.grid(True)

# Render the figure
plt.tight_layout()
plt.show()
No description has been provided for this image